#from google.colab import drive
#drive.mount('/content/drive')
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')
# Read the CSV locally (no Google Drive mount needed)
df = pd.read_csv('flight.csv')
# Preview 10 random rows
df.sample(10)
| MEMBER_NO | FFP_DATE | FIRST_FLIGHT_DATE | GENDER | FFP_TIER | WORK_CITY | WORK_PROVINCE | WORK_COUNTRY | AGE | LOAD_TIME | ... | SUM_YR_2 | SEG_KM_SUM | LAST_FLIGHT_DATE | LAST_TO_END | AVG_INTERVAL | MAX_INTERVAL | EXCHANGE_COUNT | avg_discount | Points_Sum | Point_NotFlight | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 62774 | 1931 | 2/1/2013 | 4/25/2013 | Male | 4 | shanghai | shanghai | CN | 31.0 | 3/31/2014 | ... | 242.0 | 1365 | 8/2/2013 | 243 | 99.000000 | 99 | 0 | 0.405751 | 270 | 0 |
| 20916 | 16269 | 10/22/2012 | 10/22/2012 | Female | 4 | guangzhou | guangdong | CN | 30.0 | 3/31/2014 | ... | 2954.0 | 18157 | 3/24/2014 | 8 | 64.875000 | 233 | 0 | 0.644475 | 7112 | 0 |
| 38227 | 59525 | 9/3/2011 | 9/3/2011 | Male | 4 | shenzhen | guangdong | CN | 38.0 | 3/31/2014 | ... | 0.0 | 7031 | 1/5/2013 | 452 | 26.000000 | 183 | 0 | 0.718554 | 4518 | 0 |
| 9292 | 51569 | 6/17/2012 | 6/17/2012 | Female | 4 | shenzhen | guangdong | CN | 37.0 | 3/31/2014 | ... | 13789.0 | 31055 | 8/10/2013 | 235 | 27.933333 | 116 | 0 | 0.743775 | 21274 | 0 |
| 22692 | 54825 | 7/2/2006 | 11/24/2009 | Male | 4 | beijing | beijing | CN | 64.0 | 3/31/2014 | ... | 1733.0 | 11457 | 12/6/2013 | 117 | 80.833333 | 264 | 0 | 0.935604 | 10686 | 0 |
| 8194 | 27608 | 1/10/2009 | 1/10/2009 | Male | 5 | guangzhou | guangdong | CN | 36.0 | 3/31/2014 | ... | 20371.0 | 33211 | 3/20/2014 | 12 | 17.351351 | 111 | 2 | 0.757236 | 39264 | 37 |
| 11079 | 54696 | 3/10/2006 | 10/1/2007 | Male | 4 | guangzhou | guangdong | CN | NaN | 3/31/2014 | ... | 6575.0 | 29565 | 3/30/2014 | 2 | 31.652174 | 146 | 1 | 0.688513 | 23140 | 20 |
| 50959 | 60915 | 11/12/2012 | 4/3/2013 | Male | 4 | shanghai | shanghai | CN | 28.0 | 3/31/2014 | ... | 1160.0 | 4581 | 11/6/2013 | 147 | 54.250000 | 140 | 0 | 0.569291 | 1757 | 0 |
| 53572 | 20133 | 12/2/2004 | 3/4/2005 | Male | 4 | shanghai | shanghai | CN | 42.0 | 3/31/2014 | ... | 0.0 | 2616 | 2/22/2013 | 404 | 11.000000 | 11 | 0 | 0.850000 | 2036 | 0 |
| 41084 | 59392 | 6/16/2011 | 6/16/2011 | Male | 4 | beijing | beijing | CN | 37.0 | 3/31/2014 | ... | 0.0 | 4416 | 1/19/2013 | 438 | 110.500000 | 217 | 0 | 1.000000 | 3446 | 0 |
10 rows × 23 columns
# (Old Google Drive loading path, kept for reference only)
#link= '/content/drive/My Drive/DS 29 - Rakamin Final Project/PR Unsupervised Learning/flight.csv'
#!ls '/content/drive/My Drive/DS 29 - Rakamin Final Project/PR Unsupervised Learning/flight.csv'
#df = pd.read_csv(link)
#df
# Count missing values per column
df.isna().sum()
MEMBER_NO 0 FFP_DATE 0 FIRST_FLIGHT_DATE 0 GENDER 3 FFP_TIER 0 WORK_CITY 2269 WORK_PROVINCE 3248 WORK_COUNTRY 26 AGE 420 LOAD_TIME 0 FLIGHT_COUNT 0 BP_SUM 0 SUM_YR_1 551 SUM_YR_2 138 SEG_KM_SUM 0 LAST_FLIGHT_DATE 0 LAST_TO_END 0 AVG_INTERVAL 0 MAX_INTERVAL 0 EXCHANGE_COUNT 0 avg_discount 0 Points_Sum 0 Point_NotFlight 0 dtype: int64
# Remove duplicate rows, then discard any record that still has missing values
df = df.drop_duplicates().dropna()
# Confirm that no NaN remains in any column
df.isna().sum()
MEMBER_NO 0 FFP_DATE 0 FIRST_FLIGHT_DATE 0 GENDER 0 FFP_TIER 0 WORK_CITY 0 WORK_PROVINCE 0 WORK_COUNTRY 0 AGE 0 LOAD_TIME 0 FLIGHT_COUNT 0 BP_SUM 0 SUM_YR_1 0 SUM_YR_2 0 SEG_KM_SUM 0 LAST_FLIGHT_DATE 0 LAST_TO_END 0 AVG_INTERVAL 0 MAX_INTERVAL 0 EXCHANGE_COUNT 0 avg_discount 0 Points_Sum 0 Point_NotFlight 0 dtype: int64
# Inspect dtypes and non-null counts after the first cleaning pass
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 58255 entries, 0 to 62986 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 MEMBER_NO 58255 non-null int64 1 FFP_DATE 58255 non-null object 2 FIRST_FLIGHT_DATE 58255 non-null object 3 GENDER 58255 non-null object 4 FFP_TIER 58255 non-null int64 5 WORK_CITY 58255 non-null object 6 WORK_PROVINCE 58255 non-null object 7 WORK_COUNTRY 58255 non-null object 8 AGE 58255 non-null float64 9 LOAD_TIME 58255 non-null object 10 FLIGHT_COUNT 58255 non-null int64 11 BP_SUM 58255 non-null int64 12 SUM_YR_1 58255 non-null float64 13 SUM_YR_2 58255 non-null float64 14 SEG_KM_SUM 58255 non-null int64 15 LAST_FLIGHT_DATE 58255 non-null object 16 LAST_TO_END 58255 non-null int64 17 AVG_INTERVAL 58255 non-null float64 18 MAX_INTERVAL 58255 non-null int64 19 EXCHANGE_COUNT 58255 non-null int64 20 avg_discount 58255 non-null float64 21 Points_Sum 58255 non-null int64 22 Point_NotFlight 58255 non-null int64 dtypes: float64(5), int64(10), object(8) memory usage: 10.7+ MB
Ada beberapa datatype yang tidak sesuai, seperti AGE yang bertipe float64 padahal seharusnya bertipe integer:
# AGE no longer contains NaN, so it can safely be cast from float to integer
df = df.astype({'AGE': int})
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 58255 entries, 0 to 62986 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 MEMBER_NO 58255 non-null int64 1 FFP_DATE 58255 non-null object 2 FIRST_FLIGHT_DATE 58255 non-null object 3 GENDER 58255 non-null object 4 FFP_TIER 58255 non-null int64 5 WORK_CITY 58255 non-null object 6 WORK_PROVINCE 58255 non-null object 7 WORK_COUNTRY 58255 non-null object 8 AGE 58255 non-null int32 9 LOAD_TIME 58255 non-null object 10 FLIGHT_COUNT 58255 non-null int64 11 BP_SUM 58255 non-null int64 12 SUM_YR_1 58255 non-null float64 13 SUM_YR_2 58255 non-null float64 14 SEG_KM_SUM 58255 non-null int64 15 LAST_FLIGHT_DATE 58255 non-null object 16 LAST_TO_END 58255 non-null int64 17 AVG_INTERVAL 58255 non-null float64 18 MAX_INTERVAL 58255 non-null int64 19 EXCHANGE_COUNT 58255 non-null int64 20 avg_discount 58255 non-null float64 21 Points_Sum 58255 non-null int64 22 Point_NotFlight 58255 non-null int64 dtypes: float64(4), int32(1), int64(10), object(8) memory usage: 10.4+ MB
# Normalise the malformed timestamp '2014/2/29 0:00:00' to the m/d/Y style
# used everywhere else, then show the affected rows for verification.
odd_stamp = '2014/2/29 0:00:00'
df[df['LAST_FLIGHT_DATE'].eq(odd_stamp)]
df['LAST_FLIGHT_DATE'] = df['LAST_FLIGHT_DATE'].replace(odd_stamp, '2/29/2014')
df[df['LAST_FLIGHT_DATE'].eq('2/29/2014')]
| MEMBER_NO | FFP_DATE | FIRST_FLIGHT_DATE | GENDER | FFP_TIER | WORK_CITY | WORK_PROVINCE | WORK_COUNTRY | AGE | LOAD_TIME | ... | SUM_YR_2 | SEG_KM_SUM | LAST_FLIGHT_DATE | LAST_TO_END | AVG_INTERVAL | MAX_INTERVAL | EXCHANGE_COUNT | avg_discount | Points_Sum | Point_NotFlight | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 248 | 56497 | 6/17/2007 | 6/17/2007 | Female | 6 | guangzhou | guangdong | CN | 46 | 3/31/2014 | ... | 70845.0 | 85000 | 2/29/2014 | 32 | 13.250000 | 57 | 0 | 1.373478 | 132653 | 4 |
| 447 | 17306 | 8/26/2009 | 4/29/2010 | Male | 5 | haerbin | heilongjiang | CN | 70 | 3/31/2014 | ... | 46323.0 | 69453 | 2/29/2014 | 32 | 22.766667 | 75 | 6 | 1.375395 | 101413 | 7 |
| 473 | 915 | 11/10/2006 | 1/26/2007 | Male | 5 | . | beijing | CN | 39 | 3/31/2014 | ... | 30580.0 | 65717 | 2/29/2014 | 32 | 14.250000 | 65 | 1 | 1.425974 | 93153 | 8 |
| 832 | 20336 | 7/13/2005 | 8/6/2005 | Male | 5 | guangzhou | guangdong | CN | 40 | 3/31/2014 | ... | 36417.0 | 114222 | 2/29/2014 | 32 | 8.696203 | 34 | 2 | 0.664970 | 91476 | 15 |
| 1051 | 62884 | 5/24/2006 | 4/21/2007 | Male | 5 | wuhan | hubei | CN | 50 | 3/31/2014 | ... | 35680.0 | 49859 | 2/29/2014 | 32 | 19.115385 | 72 | 4 | 1.388584 | 69047 | 4 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 61388 | 4076 | 4/8/2009 | 4/11/2009 | Male | 4 | wulumuqi | xinjiang | CN | 45 | 3/31/2014 | ... | 783.0 | 3243 | 2/29/2014 | 32 | 459.000000 | 459 | 1 | 0.308511 | 2180 | 2 |
| 61539 | 55995 | 1/11/2013 | 7/10/2013 | Male | 4 | dalianshi | liaoning | CN | 23 | 3/31/2014 | ... | 955.0 | 3513 | 2/29/2014 | 32 | 117.000000 | 211 | 0 | 0.276667 | 0 | 0 |
| 61871 | 48742 | 7/26/2009 | 7/26/2009 | Male | 4 | tianjin | tianjin | CN | 37 | 3/31/2014 | ... | 966.0 | 1298 | 2/29/2014 | 32 | 135.000000 | 135 | 0 | 0.690000 | 1094 | 1 |
| 61909 | 9190 | 4/11/2011 | 4/20/2011 | Male | 4 | guangzhou | guangdong | CN | 34 | 3/31/2014 | ... | 853.0 | 2595 | 2/29/2014 | 32 | 1.000000 | 1 | 0 | 0.340000 | 3356 | 19 |
| 62721 | 38776 | 10/31/2007 | 3/13/2008 | Male | 4 | foshan | guangdong | CN | 42 | 3/31/2014 | ... | 688.0 | 1199 | 2/29/2014 | 32 | 78.000000 | 78 | 0 | 0.489525 | 276 | 0 |
395 rows × 23 columns
# Convert every date-like column to datetime64; values that still fail to
# parse (e.g. the non-existent 2/29/2014) become NaT via errors='coerce'.
date = ['FFP_DATE', 'FIRST_FLIGHT_DATE', 'LAST_FLIGHT_DATE', 'LOAD_TIME']
for col in date:
    df[col] = pd.to_datetime(df[col], format='%m/%d/%Y', errors='coerce')
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 58255 entries, 0 to 62986 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 MEMBER_NO 58255 non-null int64 1 FFP_DATE 58255 non-null datetime64[ns] 2 FIRST_FLIGHT_DATE 58255 non-null datetime64[ns] 3 GENDER 58255 non-null object 4 FFP_TIER 58255 non-null int64 5 WORK_CITY 58255 non-null object 6 WORK_PROVINCE 58255 non-null object 7 WORK_COUNTRY 58255 non-null object 8 AGE 58255 non-null int32 9 LOAD_TIME 58255 non-null datetime64[ns] 10 FLIGHT_COUNT 58255 non-null int64 11 BP_SUM 58255 non-null int64 12 SUM_YR_1 58255 non-null float64 13 SUM_YR_2 58255 non-null float64 14 SEG_KM_SUM 58255 non-null int64 15 LAST_FLIGHT_DATE 57860 non-null datetime64[ns] 16 LAST_TO_END 58255 non-null int64 17 AVG_INTERVAL 58255 non-null float64 18 MAX_INTERVAL 58255 non-null int64 19 EXCHANGE_COUNT 58255 non-null int64 20 avg_discount 58255 non-null float64 21 Points_Sum 58255 non-null int64 22 Point_NotFlight 58255 non-null int64 dtypes: datetime64[ns](4), float64(4), int32(1), int64(10), object(4) memory usage: 10.4+ MB
Karena pada conversi ke Date menggunakan 'errors='coerce' maka pasti ada data yang NaN jika formatnya tidak sesuai
# Count the NaT values produced by the coerced date conversion
df.isna().sum()
MEMBER_NO 0 FFP_DATE 0 FIRST_FLIGHT_DATE 0 GENDER 0 FFP_TIER 0 WORK_CITY 0 WORK_PROVINCE 0 WORK_COUNTRY 0 AGE 0 LOAD_TIME 0 FLIGHT_COUNT 0 BP_SUM 0 SUM_YR_1 0 SUM_YR_2 0 SEG_KM_SUM 0 LAST_FLIGHT_DATE 395 LAST_TO_END 0 AVG_INTERVAL 0 MAX_INTERVAL 0 EXCHANGE_COUNT 0 avg_discount 0 Points_Sum 0 Point_NotFlight 0 dtype: int64
Data pada LAST_FLIGHT_DATE yang tidak sesuai format adalah tanggal 29 Februari 2014 karena di awal sudah dilakukan pengecekan jumlah pada data tersebut dan jumlah datanya sama. Data pada tanggal tersebut memang tidak terdapat dalam penanggalan kalender kabisat, sehingga wajar saja jika formatnya tidak sesuai.
# Drop the rows whose LAST_FLIGHT_DATE was coerced to NaT (invalid 2/29/2014)
df.dropna(inplace=True)
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 57860 entries, 0 to 62986 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 MEMBER_NO 57860 non-null int64 1 FFP_DATE 57860 non-null datetime64[ns] 2 FIRST_FLIGHT_DATE 57860 non-null datetime64[ns] 3 GENDER 57860 non-null object 4 FFP_TIER 57860 non-null int64 5 WORK_CITY 57860 non-null object 6 WORK_PROVINCE 57860 non-null object 7 WORK_COUNTRY 57860 non-null object 8 AGE 57860 non-null int32 9 LOAD_TIME 57860 non-null datetime64[ns] 10 FLIGHT_COUNT 57860 non-null int64 11 BP_SUM 57860 non-null int64 12 SUM_YR_1 57860 non-null float64 13 SUM_YR_2 57860 non-null float64 14 SEG_KM_SUM 57860 non-null int64 15 LAST_FLIGHT_DATE 57860 non-null datetime64[ns] 16 LAST_TO_END 57860 non-null int64 17 AVG_INTERVAL 57860 non-null float64 18 MAX_INTERVAL 57860 non-null int64 19 EXCHANGE_COUNT 57860 non-null int64 20 avg_discount 57860 non-null float64 21 Points_Sum 57860 non-null int64 22 Point_NotFlight 57860 non-null int64 dtypes: datetime64[ns](4), float64(4), int32(1), int64(10), object(4) memory usage: 10.4+ MB
# A literal '.' acts as a placeholder for unknown city/province; relabel it
# as 'other' and inspect the affected province rows.
for col in ('WORK_CITY', 'WORK_PROVINCE'):
    df[col] = df[col].replace('.', 'other')
df[df['WORK_PROVINCE'] == 'other']
| MEMBER_NO | FFP_DATE | FIRST_FLIGHT_DATE | GENDER | FFP_TIER | WORK_CITY | WORK_PROVINCE | WORK_COUNTRY | AGE | LOAD_TIME | ... | SUM_YR_2 | SEG_KM_SUM | LAST_FLIGHT_DATE | LAST_TO_END | AVG_INTERVAL | MAX_INTERVAL | EXCHANGE_COUNT | avg_discount | Points_Sum | Point_NotFlight | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 11 | 37994 | 2004-11-13 | 2004-12-02 | Male | 6 | beijing | other | CN | 47 | 2014-03-31 | ... | 126821.0 | 271438 | 2014-03-26 | 6 | 5.027778 | 42 | 15 | 0.952535 | 353101 | 66 |
| 59 | 11034 | 2005-01-26 | 2005-03-09 | Male | 5 | beijing | other | CN | 59 | 2014-03-31 | ... | 85537.0 | 214921 | 2014-03-11 | 21 | 9.133333 | 34 | 5 | 0.819187 | 190530 | 13 |
| 536 | 21740 | 2010-03-01 | 2010-03-26 | Male | 4 | other | other | JP | 46 | 2014-03-31 | ... | 7443.0 | 78595 | 2014-03-30 | 2 | 12.068966 | 69 | 5 | 1.142706 | 88924 | 6 |
| 676 | 8346 | 2011-04-05 | 2011-05-27 | Male | 5 | yisilanbao | other | PK | 53 | 2014-03-31 | ... | 31664.0 | 75213 | 2014-01-31 | 61 | 18.028571 | 116 | 0 | 1.086663 | 78063 | 2 |
| 788 | 41908 | 2011-10-23 | 2011-10-23 | Male | 6 | beijing | other | CN | 53 | 2014-03-31 | ... | 34988.0 | 59280 | 2014-03-19 | 13 | 13.807692 | 90 | 4 | 1.304835 | 73270 | 6 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 62118 | 61405 | 2013-03-24 | 2013-03-24 | Male | 4 | shanghai | other | CN | 36 | 2014-03-31 | ... | 0.0 | 3321 | 2013-03-26 | 372 | 2.000000 | 2 | 0 | 0.250000 | 0 | 0 |
| 62191 | 62222 | 2011-11-15 | 2012-04-01 | Male | 4 | beijing | other | CN | 36 | 2014-03-31 | ... | 0.0 | 2307 | 2012-05-30 | 672 | 59.000000 | 59 | 0 | 0.351105 | 331 | 0 |
| 62252 | 52651 | 2011-09-28 | 2013-05-11 | Male | 4 | shanghai | other | CN | 54 | 2014-03-31 | ... | 922.0 | 1359 | 2013-05-13 | 324 | 2.000000 | 2 | 0 | 0.581604 | 506 | 0 |
| 62733 | 4973 | 2011-01-15 | 2011-03-25 | Male | 4 | beijing | other | CN | 31 | 2014-03-31 | ... | 0.0 | 1158 | 2013-01-14 | 443 | 10.000000 | 10 | 0 | 0.500000 | 200 | 0 |
| 62982 | 16415 | 2013-01-20 | 2013-01-20 | Female | 4 | beijing | other | CN | 35 | 2014-03-31 | ... | 0.0 | 3848 | 2013-01-20 | 437 | 0.000000 | 0 | 0 | 0.000000 | 0 | 0 |
912 rows × 23 columns
# Summary statistics for the numeric columns
df.describe()
| MEMBER_NO | FFP_TIER | AGE | FLIGHT_COUNT | BP_SUM | SUM_YR_1 | SUM_YR_2 | SEG_KM_SUM | LAST_TO_END | AVG_INTERVAL | MAX_INTERVAL | EXCHANGE_COUNT | avg_discount | Points_Sum | Point_NotFlight | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 57860.000000 | 57860.000000 | 57860.000000 | 57860.000000 | 57860.000000 | 57860.000000 | 57860.000000 | 57860.000000 | 57860.00000 | 57860.000000 | 57860.000000 | 57860.000000 | 57860.000000 | 57860.000000 | 57860.000000 |
| mean | 31543.210854 | 4.104666 | 42.233253 | 12.043000 | 11047.843726 | 5363.816955 | 5679.279658 | 17324.371863 | 172.20598 | 67.963638 | 167.221673 | 0.327981 | 0.720626 | 12721.368960 | 2.819703 |
| std | 18182.130529 | 0.378206 | 9.763364 | 14.239523 | 16294.179086 | 8110.434363 | 8714.783954 | 20982.734648 | 180.80718 | 77.533059 | 122.901236 | 1.149762 | 0.183942 | 20621.601695 | 7.497873 |
| min | 3.000000 | 4.000000 | 6.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 368.000000 | 1.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 15794.750000 | 4.000000 | 35.000000 | 3.000000 | 2599.750000 | 1020.000000 | 833.000000 | 4882.000000 | 28.00000 | 23.666667 | 81.000000 | 0.000000 | 0.612019 | 2863.000000 | 0.000000 |
| 50% | 31591.500000 | 4.000000 | 41.000000 | 7.000000 | 5814.000000 | 2804.000000 | 2830.000000 | 10208.000000 | 105.00000 | 44.812500 | 144.000000 | 0.000000 | 0.711429 | 6468.000000 | 0.000000 |
| 75% | 47307.250000 | 4.000000 | 48.000000 | 15.000000 | 12976.250000 | 6584.000000 | 6931.000000 | 21519.000000 | 259.25000 | 82.000000 | 228.000000 | 0.000000 | 0.808333 | 14491.000000 | 1.000000 |
| max | 62988.000000 | 6.000000 | 110.000000 | 213.000000 | 505308.000000 | 239560.000000 | 234188.000000 | 580717.000000 | 731.00000 | 728.000000 | 728.000000 | 46.000000 | 1.500000 | 985572.000000 | 140.000000 |
Dari data numerik, terdapat hal yang aneh pada fitur avg_discount. Fitur tersebut memiliki nilai max 1.5 jika dipersenkan itu sama dengan 150%, sedangkan diskon seharusnya berkisar <= 100%.
# Summary statistics for the categorical (object-dtype) columns
cats = [col for col in df.columns if df[col].dtypes == 'object']
df[cats].describe()
| GENDER | WORK_CITY | WORK_PROVINCE | WORK_COUNTRY | |
|---|---|---|---|---|
| count | 57860 | 57860 | 57860 | 57860 |
| unique | 2 | 2935 | 1123 | 105 |
| top | Male | guangzhou | guangdong | CN |
| freq | 44127 | 9223 | 17215 | 54572 |
# Pairwise relationships across the whole dataframe (KDE on the diagonal)
plt.figure(figsize=(15,15))
sns.pairplot(df, diag_kind='kde')
<seaborn.axisgrid.PairGrid at 0x18bc8bdc430>
<Figure size 1080x1080 with 0 Axes>
# KDE plot of every numeric feature.
# FIX: the np.bool alias was removed in NumPy 1.24, so exclude=[np.bool]
# raises AttributeError; it was also redundant because np.number never
# matches boolean columns. fill takes a real boolean, not the string 'True'.
nums = df.select_dtypes(include=[np.number]).columns
plt.figure(figsize=(16,9))
for i in range(len(nums)):
    plt.subplot(3, 5, i + 1)
    sns.kdeplot(data=df, x=df[nums[i]], fill=True)
plt.tight_layout()
Fitur avg_discount memiliki nilai lebih dari 1 (100%), sehingga akan dilakukan drop pada fitur tersebut yang memiliki nilai lebih dari 1.
# KDE plot of the avg_discount feature on its own
ax = sns.kdeplot(x=df['avg_discount'])
ax.set(title="Distribution of Average Discount",
       ylabel="Density",
       xlabel="Discount")
plt.tight_layout()
# Keep only records whose average discount is at most 100% (<= 1.0);
# anything above that is not a plausible discount value.
df = df.loc[df['avg_discount'] <= 1]
df.sample(10)
| MEMBER_NO | FFP_DATE | FIRST_FLIGHT_DATE | GENDER | FFP_TIER | WORK_CITY | WORK_PROVINCE | WORK_COUNTRY | AGE | LOAD_TIME | ... | SUM_YR_2 | SEG_KM_SUM | LAST_FLIGHT_DATE | LAST_TO_END | AVG_INTERVAL | MAX_INTERVAL | EXCHANGE_COUNT | avg_discount | Points_Sum | Point_NotFlight | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 62625 | 59991 | 2012-05-08 | 2013-05-11 | Female | 4 | guangzhou | guangdong | CN | 28 | 2014-03-31 | ... | 630.0 | 2572 | 2013-05-27 | 310 | 16.000000 | 16 | 0 | 0.250000 | 57 | 1 |
| 21930 | 48204 | 2008-06-03 | 2008-11-05 | Male | 4 | beijing | beijing | CN | 39 | 2014-03-31 | ... | 3389.0 | 15023 | 2014-03-14 | 18 | 55.454545 | 297 | 0 | 0.738198 | 10446 | 0 |
| 43517 | 39889 | 2009-03-09 | 2009-12-30 | Male | 4 | Hachioji | Tokyo | JP | 46 | 2014-03-31 | ... | 1086.0 | 5708 | 2013-08-24 | 221 | 141.000000 | 252 | 1 | 0.683075 | 20766 | 4 |
| 28918 | 62772 | 2005-01-01 | 2006-11-10 | Male | 4 | haerbin | heilongjiang | CN | 47 | 2014-03-31 | ... | 0.0 | 11732 | 2013-07-05 | 271 | 62.714286 | 127 | 0 | 0.671094 | 6448 | 0 |
| 40967 | 34694 | 2010-07-15 | 2010-07-15 | Male | 4 | shanghai | shanghai | CN | 29 | 2014-03-31 | ... | 3076.0 | 5094 | 2014-01-24 | 68 | 77.750000 | 203 | 0 | 0.871692 | 9659 | 12 |
| 18657 | 48988 | 2012-04-27 | 2012-04-28 | Male | 4 | xinxiang | henan | CN | 47 | 2014-03-31 | ... | 8857.0 | 19522 | 2014-01-23 | 69 | 48.846154 | 153 | 0 | 0.672240 | 12489 | 1 |
| 42136 | 866 | 2006-08-15 | 2007-03-16 | Male | 4 | changsha | hunan | CN | 47 | 2014-03-31 | ... | 0.0 | 5325 | 2012-10-23 | 526 | 61.000000 | 155 | 0 | 0.786516 | 3156 | 0 |
| 60869 | 14336 | 2012-07-02 | 2012-07-02 | Male | 4 | Gangnam-gu | Seoul | KR | 40 | 2014-03-31 | ... | 0.0 | 1159 | 2012-07-06 | 635 | 4.000000 | 4 | 0 | 0.949957 | 1000 | 0 |
| 34174 | 31250 | 2010-07-06 | 2010-11-18 | Male | 4 | xian | shanxisheng | CN | 39 | 2014-03-31 | ... | 2188.0 | 9658 | 2013-08-10 | 235 | 45.000000 | 136 | 0 | 0.636778 | 6599 | 2 |
| 54702 | 40364 | 2009-12-31 | 2009-12-31 | Male | 4 | shenzhen | guangdong | CN | 35 | 2014-03-31 | ... | 1895.0 | 3545 | 2013-07-26 | 250 | 71.000000 | 71 | 0 | 0.579083 | 1341 | 0 |
10 rows × 23 columns
# Row count after removing avg_discount > 1 records
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 55236 entries, 0 to 62986 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 MEMBER_NO 55236 non-null int64 1 FFP_DATE 55236 non-null datetime64[ns] 2 FIRST_FLIGHT_DATE 55236 non-null datetime64[ns] 3 GENDER 55236 non-null object 4 FFP_TIER 55236 non-null int64 5 WORK_CITY 55236 non-null object 6 WORK_PROVINCE 55236 non-null object 7 WORK_COUNTRY 55236 non-null object 8 AGE 55236 non-null int32 9 LOAD_TIME 55236 non-null datetime64[ns] 10 FLIGHT_COUNT 55236 non-null int64 11 BP_SUM 55236 non-null int64 12 SUM_YR_1 55236 non-null float64 13 SUM_YR_2 55236 non-null float64 14 SEG_KM_SUM 55236 non-null int64 15 LAST_FLIGHT_DATE 55236 non-null datetime64[ns] 16 LAST_TO_END 55236 non-null int64 17 AVG_INTERVAL 55236 non-null float64 18 MAX_INTERVAL 55236 non-null int64 19 EXCHANGE_COUNT 55236 non-null int64 20 avg_discount 55236 non-null float64 21 Points_Sum 55236 non-null int64 22 Point_NotFlight 55236 non-null int64 dtypes: datetime64[ns](4), float64(4), int32(1), int64(10), object(4) memory usage: 9.9+ MB
Tambahan
Membersihkan data sesuai dengan prosedur standar untuk dataset penerbangan
# Drop inconsistent records: kilometres were flown (SEG_KM_SUM > 0) yet both
# yearly fares and the average discount are zero.
inconsistent = (
    (df['SUM_YR_1'] == 0)
    & (df['SUM_YR_2'] == 0)
    & (df['avg_discount'] == 0)
    & (df['SEG_KM_SUM'] > 0)
)
df.drop(index=df[inconsistent].index, inplace=True)
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 55229 entries, 0 to 62978 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 MEMBER_NO 55229 non-null int64 1 FFP_DATE 55229 non-null datetime64[ns] 2 FIRST_FLIGHT_DATE 55229 non-null datetime64[ns] 3 GENDER 55229 non-null object 4 FFP_TIER 55229 non-null int64 5 WORK_CITY 55229 non-null object 6 WORK_PROVINCE 55229 non-null object 7 WORK_COUNTRY 55229 non-null object 8 AGE 55229 non-null int32 9 LOAD_TIME 55229 non-null datetime64[ns] 10 FLIGHT_COUNT 55229 non-null int64 11 BP_SUM 55229 non-null int64 12 SUM_YR_1 55229 non-null float64 13 SUM_YR_2 55229 non-null float64 14 SEG_KM_SUM 55229 non-null int64 15 LAST_FLIGHT_DATE 55229 non-null datetime64[ns] 16 LAST_TO_END 55229 non-null int64 17 AVG_INTERVAL 55229 non-null float64 18 MAX_INTERVAL 55229 non-null int64 19 EXCHANGE_COUNT 55229 non-null int64 20 avg_discount 55229 non-null float64 21 Points_Sum 55229 non-null int64 22 Point_NotFlight 55229 non-null int64 dtypes: datetime64[ns](4), float64(4), int32(1), int64(10), object(4) memory usage: 9.9+ MB
# Correlation heatmap of the numeric features.
# FIX: since pandas 2.0, DataFrame.corr() raises on object/datetime columns
# instead of silently dropping them, so select the numeric columns explicitly.
plt.figure(figsize=(15, 15))
sns.heatmap(df.select_dtypes(include=np.number).corr(), cmap='Blues', annot=True, fmt='.2f')
<AxesSubplot:>
Fitur yang tidak memiliki pengaruh signifikan, yaitu yang nilai korelasinya kurang dari 0.05 terhadap fitur lainnya, adalah fitur MEMBER_NO, maka fitur tersebut akan didrop.
# MEMBER_NO is just a row identifier (|corr| < 0.05 with everything) — drop it,
# then re-plot the correlation heatmap.
# FIX: restrict corr() to numeric columns; pandas >= 2.0 raises otherwise.
df = df.drop(['MEMBER_NO'], axis=1)
plt.figure(figsize=(15, 15))
sns.heatmap(df.select_dtypes(include=np.number).corr(), cmap='Blues', annot=True, fmt='.2f')
<AxesSubplot:>
Selanjutnya, berdasarkan dua jurnal mengenai airline passenger, yaitu:
data ini akan menggunakan model LRFMC. Dengan penjelasan:
Maka feature yang diambil :
# Columns needed to build the LRFMC model
feature = ['FFP_DATE', 'LOAD_TIME', 'FLIGHT_COUNT', 'avg_discount', 'SEG_KM_SUM', 'LAST_TO_END']
dfgo = df[feature].copy()
# L (Loyalty): membership length in whole months between enrolment (FFP_DATE)
# and the data snapshot (LOAD_TIME)
membership_days = (dfgo['LOAD_TIME'] - dfgo['FFP_DATE']).dt.days
dfgo['TIME_MONTH'] = (membership_days / 30).astype(int)
dfgo.sample(10)
| FFP_DATE | LOAD_TIME | FLIGHT_COUNT | avg_discount | SEG_KM_SUM | LAST_TO_END | TIME_MONTH | |
|---|---|---|---|---|---|---|---|
| 21025 | 2006-04-20 | 2014-03-31 | 12 | 0.655682 | 17740 | 132 | 96 |
| 61564 | 2012-11-10 | 2014-03-31 | 2 | 0.460000 | 2102 | 265 | 16 |
| 60681 | 2012-06-25 | 2014-03-31 | 2 | 0.647436 | 1755 | 605 | 21 |
| 37976 | 2006-03-16 | 2014-03-31 | 6 | 0.597093 | 8573 | 22 | 97 |
| 17024 | 2007-11-21 | 2014-03-31 | 22 | 0.778716 | 18346 | 73 | 77 |
| 14222 | 2005-12-27 | 2014-03-31 | 25 | 0.636692 | 26180 | 6 | 100 |
| 46694 | 2009-08-24 | 2014-03-31 | 2 | 0.888383 | 3736 | 551 | 56 |
| 31328 | 2009-12-01 | 2014-03-31 | 12 | 0.800508 | 8788 | 3 | 52 |
| 5355 | 2005-04-01 | 2014-03-31 | 32 | 0.654449 | 50083 | 25 | 109 |
| 45157 | 2011-11-29 | 2014-03-31 | 5 | 0.640427 | 5615 | 109 | 28 |
# Final LRFMC feature set: L=TIME_MONTH, R=LAST_TO_END, F=FLIGHT_COUNT,
# M=SEG_KM_SUM, C=avg_discount
lrfmc = ['TIME_MONTH', 'LAST_TO_END','FLIGHT_COUNT','SEG_KM_SUM','avg_discount']
dfgo_feat = dfgo[lrfmc].copy()
dfgo_feat.sample(10)
| TIME_MONTH | LAST_TO_END | FLIGHT_COUNT | SEG_KM_SUM | avg_discount | |
|---|---|---|---|---|---|
| 49893 | 59 | 2 | 5 | 6638 | 0.417436 |
| 38309 | 54 | 199 | 7 | 9145 | 0.550576 |
| 11511 | 13 | 26 | 8 | 29663 | 0.665208 |
| 50526 | 83 | 547 | 3 | 4216 | 0.634298 |
| 31844 | 12 | 102 | 9 | 7914 | 0.867504 |
| 56938 | 25 | 705 | 2 | 3868 | 0.445514 |
| 50814 | 68 | 39 | 7 | 7895 | 0.333406 |
| 5149 | 46 | 13 | 37 | 50204 | 0.667615 |
| 12922 | 31 | 27 | 27 | 28767 | 0.627027 |
| 60528 | 41 | 87 | 2 | 1508 | 0.771797 |
# Boxplots of the LRFMC features before any outlier handling
plt.figure(figsize=(12, 6))
for idx, col in enumerate(lrfmc):
    plt.subplot(1, len(lrfmc), idx + 1)
    sns.boxplot(y=dfgo[col], color='blue', orient='v')
plt.tight_layout()
# Distribution (KDE) of each LRFMC feature before outlier handling
plt.figure(figsize=(12, 6))
for idx, col in enumerate(lrfmc):
    plt.subplot(2, 3, idx + 1)
    sns.kdeplot(x=dfgo[col], color='green')
    plt.xlabel(col)
plt.tight_layout()
# Outlier removal with the 1.5*IQR rule, applied feature by feature
print("Jumlah sebelum filter: ", len(dfgo_feat))
for col in lrfmc:
    q1, q3 = dfgo_feat[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower = q1 - (1.5 * iqr)
    upper = q3 + (1.5 * iqr)
    # between() is inclusive on both ends, matching >= lower and <= upper
    dfgo_feat = dfgo_feat[dfgo_feat[col].between(lower, upper)]
print("Jumlah setelah filter:", len(dfgo_feat))
Jumlah sebelum filter: 55229 Jumlah setelah filter: 46589
# Re-check outliers AFTER the IQR filter.
# BUG FIX: the original plotted the unfiltered dfgo, so the plots could not
# reflect the outlier removal that was applied to dfgo_feat.
plt.figure(figsize=(12, 6))
for i in range(len(lrfmc)):
    plt.subplot(1, len(lrfmc), i+1)
    sns.boxplot(y=dfgo_feat[lrfmc[i]], color='blue', orient='v')
plt.tight_layout()
# Distribution of each feature after IQR outlier handling.
# BUG FIX: plot the filtered dfgo_feat instead of the untouched dfgo,
# otherwise the "after" plots are identical to the "before" plots.
plt.figure(figsize=(12, 6))
for i in range(0, len(lrfmc)):
    plt.subplot(2, 3, i+1)
    sns.kdeplot(x=dfgo_feat[lrfmc[i]], color = 'green')
    plt.xlabel(lrfmc[i])
plt.tight_layout()
# Reset dfgo_feat to the unfiltered feature values: the IQR result above
# removed too many rows, so a different transformation strategy is tried next.
lrfmc = ['TIME_MONTH', 'LAST_TO_END','FLIGHT_COUNT','SEG_KM_SUM','avg_discount']
dfgo_feat = dfgo[lrfmc].copy()
dfgo_feat
| TIME_MONTH | LAST_TO_END | FLIGHT_COUNT | SEG_KM_SUM | avg_discount | |
|---|---|---|---|---|---|
| 0 | 90 | 1 | 210 | 580717 | 0.961639 |
| 4 | 60 | 5 | 152 | 309928 | 0.970658 |
| 5 | 74 | 79 | 92 | 294585 | 0.967692 |
| 6 | 97 | 1 | 101 | 287042 | 0.965347 |
| 7 | 48 | 3 | 73 | 287230 | 0.962070 |
| ... | ... | ... | ... | ... | ... |
| 62971 | 13 | 360 | 2 | 1096 | 0.250000 |
| 62972 | 39 | 180 | 2 | 1914 | 0.140000 |
| 62973 | 12 | 357 | 2 | 760 | 0.350000 |
| 62976 | 45 | 39 | 2 | 1062 | 0.225000 |
| 62978 | 36 | 282 | 2 | 760 | 0.280000 |
55229 rows × 5 columns
# Log-transform every LRFMC feature; a small positive shift is added first
# ("shift-and-log") so that zero values do not map to -inf.
dfgo2 = pd.DataFrame()
for col in lrfmc:
    dfgo2[col] = np.log(dfgo_feat[col] + 0.001)
# Distribution of each feature after the log transform
plt.figure(figsize=(12, 6))
for idx, col in enumerate(lrfmc):
    plt.subplot(2, 3, idx + 1)
    sns.kdeplot(x=dfgo2[col], color='green')
    plt.xlabel(col)
plt.tight_layout()
# Boxplots of the log-transformed features
plt.figure(figsize=(12, 6))
for idx, col in enumerate(lrfmc):
    plt.subplot(1, len(lrfmc), idx + 1)
    sns.boxplot(y=dfgo2[col], color='blue', orient='v')
plt.tight_layout()
Setelah melakukan beberapa iterasi, diputuskan melakukan beberapa proses berbeda dalam usaha menghilangkan extreme value dan menormalkan sebaran, diantaranya:
# Start again from the raw LRFMC values for the final pipeline:
# log-transform four skewed features, then IQR-filter only avg_discount.
lrfmc = ['TIME_MONTH', 'LAST_TO_END','FLIGHT_COUNT','SEG_KM_SUM','avg_discount']
dfgo_feat = dfgo[lrfmc].copy()
dfgo_feat
| TIME_MONTH | LAST_TO_END | FLIGHT_COUNT | SEG_KM_SUM | avg_discount | |
|---|---|---|---|---|---|
| 0 | 90 | 1 | 210 | 580717 | 0.961639 |
| 4 | 60 | 5 | 152 | 309928 | 0.970658 |
| 5 | 74 | 79 | 92 | 294585 | 0.967692 |
| 6 | 97 | 1 | 101 | 287042 | 0.965347 |
| 7 | 48 | 3 | 73 | 287230 | 0.962070 |
| ... | ... | ... | ... | ... | ... |
| 62971 | 13 | 360 | 2 | 1096 | 0.250000 |
| 62972 | 39 | 180 | 2 | 1914 | 0.140000 |
| 62973 | 12 | 357 | 2 | 760 | 0.350000 |
| 62976 | 45 | 39 | 2 | 1062 | 0.225000 |
| 62978 | 36 | 282 | 2 | 760 | 0.280000 |
55229 rows × 5 columns
# Shift-and-log transform for the four heavily right-skewed features; the
# small shift prevents -inf when a value is zero. avg_discount is already
# bounded in [0, 1], so it stays on its original scale.
list_log = ['TIME_MONTH', 'LAST_TO_END', 'FLIGHT_COUNT', 'SEG_KM_SUM']
for col in list_log:
    dfgo_feat[col] = np.log(dfgo_feat[col] + 0.001)
# IQR outlier removal applied only to avg_discount
print("Jumlah sebelum filter: ", len(dfgo_feat))
q1, q3 = dfgo_feat['avg_discount'].quantile([0.25, 0.75])
iqr = q3 - q1
lower = q1 - (1.5 * iqr)
upper = q3 + (1.5 * iqr)
dfgo_feat = dfgo_feat[dfgo_feat['avg_discount'].between(lower, upper)]
print("Jumlah setelah filter:", len(dfgo_feat))
Jumlah sebelum filter: 55229 Jumlah setelah filter: 54783
# Boxplots after the final outlier processing
plt.figure(figsize=(12, 6))
for idx, col in enumerate(lrfmc):
    plt.subplot(1, len(lrfmc), idx + 1)
    sns.boxplot(y=dfgo_feat[col], color='blue', orient='v')
plt.tight_layout()
# Distribution of each feature after the final outlier processing
plt.figure(figsize=(12, 6))
for idx, col in enumerate(lrfmc):
    plt.subplot(2, 3, idx + 1)
    sns.kdeplot(x=dfgo_feat[col], color='green')
    plt.xlabel(col)
plt.tight_layout()
Sebaran data diatas adalah sebaran data "Terbaik" yang berhasil dibentuk
# Spot-check the transformed feature values
dfgo_feat.sample(10)
| TIME_MONTH | LAST_TO_END | FLIGHT_COUNT | SEG_KM_SUM | avg_discount | |
|---|---|---|---|---|---|
| 44482 | 4.605180 | 6.200511 | 1.609638 | 8.802071 | 0.559116 |
| 34097 | 2.833272 | 0.001000 | 2.197336 | 9.096724 | 0.691219 |
| 18882 | 3.401231 | 0.693647 | 2.484990 | 9.748762 | 0.755771 |
| 44661 | 4.025370 | 5.407176 | 0.693647 | 8.666130 | 0.635585 |
| 54217 | 4.369461 | 5.049862 | 1.098946 | 7.746301 | 0.920000 |
| 9542 | 2.944492 | 1.386544 | 2.995782 | 10.372491 | 0.709995 |
| 27053 | 4.394462 | 1.791926 | 1.791926 | 9.322329 | 0.770678 |
| 31593 | 4.605180 | 2.639129 | 1.946053 | 9.226509 | 0.683702 |
| 19082 | 4.304079 | 2.995782 | 3.218916 | 9.532931 | 0.928035 |
| 41394 | 3.850169 | 5.673327 | 2.197336 | 9.179159 | 0.448643 |
# Standardise the features to zero mean / unit variance before KMeans,
# since the clustering distance is scale-sensitive.
scaler = StandardScaler()
dfgo_std = pd.DataFrame(scaler.fit_transform(dfgo_feat), columns=lrfmc)
dfgo_std
| TIME_MONTH | LAST_TO_END | FLIGHT_COUNT | SEG_KM_SUM | avg_discount | |
|---|---|---|---|---|---|
| 0 | 1.296328 | -2.817974 | 3.538403 | 4.007522 | 1.894154 |
| 1 | 0.651695 | -1.775012 | 3.195806 | 3.382750 | 1.959232 |
| 2 | 0.985121 | 0.014321 | 2.663626 | 3.332233 | 1.937834 |
| 3 | 1.415411 | -2.817974 | 2.762551 | 3.306424 | 1.920907 |
| 4 | 0.296929 | -2.106119 | 2.418436 | 3.307075 | 1.897265 |
| ... | ... | ... | ... | ... | ... |
| 54778 | -0.347697 | 1.200168 | -1.393962 | -3.319448 | 0.367003 |
| 54779 | 1.658147 | 0.766391 | -1.393962 | -3.319448 | 0.367003 |
| 54780 | -1.661955 | 1.091379 | -1.393962 | -3.319448 | 0.367003 |
| 54781 | 0.754302 | 1.094489 | -1.393962 | -3.319448 | 0.367003 |
| 54782 | -1.907018 | 0.992215 | -1.393962 | -2.597854 | -2.519327 |
54783 rows × 5 columns
# Verify standardisation: means ~0 and stds ~1 for every column
dfgo_std.describe()
| TIME_MONTH | LAST_TO_END | FLIGHT_COUNT | SEG_KM_SUM | avg_discount | |
|---|---|---|---|---|---|
| count | 5.478300e+04 | 5.478300e+04 | 5.478300e+04 | 5.478300e+04 | 5.478300e+04 |
| mean | -1.326241e-15 | 3.169927e-14 | 2.349026e-13 | -6.119462e-15 | 2.912581e-15 |
| std | 1.000009e+00 | 1.000009e+00 | 1.000009e+00 | 1.000009e+00 | 1.000009e+00 |
| min | -1.907018e+00 | -2.817974e+00 | -1.393962e+00 | -3.319448e+00 | -2.703884e+00 |
| 25% | -8.727230e-01 | -6.581611e-01 | -9.643720e-01 | -7.400917e-01 | -6.499494e-01 |
| 50% | 7.065632e-03 | 2.049288e-01 | -6.649427e-02 | -1.484605e-02 | 4.187086e-02 |
| 75% | 9.193241e-01 | 7.866531e-01 | 7.412429e-01 | 7.177402e-01 | 6.901487e-01 |
| max | 1.672155e+00 | 1.456872e+00 | 3.538403e+00 | 4.007522e+00 | 2.170960e+00 |
# Elbow method: record the KMeans inertia for k = 1..10 and plot it
inertia = []
ks = range(1, 11)
for k in ks:
    km = KMeans(n_clusters=k, init='k-means++', max_iter=300, n_init=10, random_state=42)
    km.fit(dfgo_std.values)
    inertia.append(km.inertia_)
sns.lineplot(x=list(ks), y=inertia, linewidth=4)
sns.scatterplot(x=list(ks), y=inertia, s=58)
<AxesSubplot:>
# Drop in inertia between each k and k+1 (larger drop = stronger elbow)
pd.Series(inertia) - pd.Series(inertia).shift(-1)
0 90525.310000 1 28143.996708 2 18443.841919 3 16641.430828 4 9613.630863 5 7601.269215 6 7015.543842 7 5482.624934 8 4430.628859 9 NaN dtype: float64
Berdasarkan elbow method, jumlah cluster efisien di 3 cluster. Akan tetapi, dari visualisasinya belum meyakinkan soal pemilihan clusternya. Oleh sebab itu, akan dilakukan visualisasi distortion score dan silhouette score elbow untuk menentukan jumlah cluster yang tepat.
# Inertia vs k visualised with yellowbrick's distortion metric,
# with automatic elbow detection enabled
from yellowbrick.cluster import KElbowVisualizer
elbow_viz = KElbowVisualizer(KMeans(random_state=42), k=(2, 11),
                             metric='distortion', timings=True,
                             locate_elbow=True)
elbow_viz.fit(dfgo_std)
elbow_viz.show()
<AxesSubplot:title={'center':'Distortion Score Elbow for KMeans Clustering'}, xlabel='k', ylabel='distortion score'>
# Same elbow search, this time scored with the mean silhouette coefficient.
model = KMeans(random_state=42)
visualizer = KElbowVisualizer(
    model,
    k=(2, 11),
    metric='silhouette',
    timings=True,
    locate_elbow=True,
)
visualizer.fit(dfgo_std)
visualizer.show()
<AxesSubplot:title={'center':'Silhouette Score Elbow for KMeans Clustering'}, xlabel='k', ylabel='silhouette score'>
Pada visualisasi distortion score dan silhouette score, nilai k atau n_cluster yang diperoleh berbeda, sehingga kita akan membandingkan nilai average silhouette score pada range 2 sampai 5.
# Per-sample silhouette diagrams for each candidate cluster count.
from yellowbrick.cluster import SilhouetteVisualizer

for k in (2, 3, 4, 5):
    model = KMeans(k, random_state=42)
    visualizer = SilhouetteVisualizer(model, colors='yellowbrick')
    visualizer.fit(dfgo_std)
    visualizer.show()
from sklearn.metrics import silhouette_score

# Average silhouette score for each candidate k.
for k in (2, 3, 4, 5):
    # Fit K-Means and get one cluster label per sample.
    model_clus = KMeans(n_clusters=k, max_iter=1000, random_state=42)
    labels = model_clus.fit_predict(dfgo_std)
    # Mean silhouette coefficient across all samples.
    score = silhouette_score(dfgo_std, labels)
    print(f"For n_clusters = {k}, the silhouette score is {score}")
For n_clusters = 2, the silhouette score is 0.2825605672062411 For n_clusters = 3, the silhouette score is 0.2345352812857464 For n_clusters = 4, the silhouette score is 0.21029068939003048 For n_clusters = 5, the silhouette score is 0.21405154698089732
Dari hasil output di atas, jumlah cluster yang memiliki komposisi yang seimbang dengan setiap anggota cluster dan memperhatikan average silhouette score dari masing-masing cluster adalah 4 cluster. Walaupun 5 cluster memiliki average silhouette score yang lebih tinggi dibandingkan 4 cluster, 5 cluster memiliki komposisi tiap anggota cluster yang lebih sedikit dibandingkan dengan 4 cluster, sehingga model ini lebih efisien jika cluster yang digunakan adalah 4 cluster
# Final model: 4 clusters, chosen from the elbow + silhouette analysis above.
# max_iter=300 is sklearn's default; kept explicit for clarity.
kmeans = KMeans(n_clusters=4, random_state=42, max_iter=300)
kmeans.fit(dfgo_std.values)
KMeans(n_clusters=4, random_state=42)
# Attach the fitted cluster assignment as a new 'label' column.
dfgo_std.loc[:, 'label'] = kmeans.labels_
dfgo_std
| TIME_MONTH | LAST_TO_END | FLIGHT_COUNT | SEG_KM_SUM | avg_discount | label | |
|---|---|---|---|---|---|---|
| 0 | 1.296328 | -2.817974 | 3.538403 | 4.007522 | 1.894154 | 2 |
| 1 | 0.651695 | -1.775012 | 3.195806 | 3.382750 | 1.959232 | 2 |
| 2 | 0.985121 | 0.014321 | 2.663626 | 3.332233 | 1.937834 | 2 |
| 3 | 1.415411 | -2.817974 | 2.762551 | 3.306424 | 1.920907 | 2 |
| 4 | 0.296929 | -2.106119 | 2.418436 | 3.307075 | 1.897265 | 2 |
| ... | ... | ... | ... | ... | ... | ... |
| 54778 | -0.347697 | 1.200168 | -1.393962 | -3.319448 | 0.367003 | 3 |
| 54779 | 1.658147 | 0.766391 | -1.393962 | -3.319448 | 0.367003 | 3 |
| 54780 | -1.661955 | 1.091379 | -1.393962 | -3.319448 | 0.367003 | 3 |
| 54781 | 0.754302 | 1.094489 | -1.393962 | -3.319448 | 0.367003 | 3 |
| 54782 | -1.907018 | 0.992215 | -1.393962 | -2.597854 | -2.519327 | 0 |
54783 rows × 6 columns
# Project the 5 standardized features onto 2 principal components for plotting.
X = dfgo_std.drop(columns=['label'])
Y = dfgo_std['label'].copy()

# Re-scale the features (they are already ~standardized, so this is a
# near no-op), then fit the 2-component PCA.
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_std)

# 2-D frame of the projected points, with the cluster labels re-attached.
pdf = pd.DataFrame(X_pca, columns=['1', '2'])
pdf['label'] = Y
pdf.sample(5)
| 1 | 2 | label | |
|---|---|---|---|
| 51624 | -1.931775 | -1.620259 | 3 |
| 43273 | -2.280904 | 0.271682 | 3 |
| 17359 | 0.621228 | 0.924121 | 1 |
| 36123 | -1.353663 | 0.656963 | 0 |
| 22563 | 0.390142 | -1.421616 | 3 |
# Distribution summary of the two principal components and the labels.
pdf.describe()
| 1 | 2 | label | |
|---|---|---|---|
| count | 5.478300e+04 | 5.478300e+04 | 54783.000000 |
| mean | 5.783503e-15 | -9.025591e-17 | 1.433018 |
| std | 1.552467e+00 | 1.008875e+00 | 1.117090 |
| min | -3.649351e+00 | -3.179294e+00 | 0.000000 |
| 25% | -1.213490e+00 | -6.816991e-01 | 1.000000 |
| 50% | -1.242341e-01 | -2.768355e-03 | 1.000000 |
| 75% | 1.081889e+00 | 6.756587e-01 | 3.000000 |
| max | 6.307129e+00 | 3.302354e+00 | 3.000000 |
# Per-cluster summary statistics of the two PCA components.
pdf.groupby('label').agg(['mean', 'median', 'std', 'count'])
| 1 | 2 | |||||||
|---|---|---|---|---|---|---|---|---|
| mean | median | std | count | mean | median | std | count | |
| label | ||||||||
| 0 | -1.215342 | -1.190803 | 0.797192 | 12693 | 0.941002 | 0.884314 | 0.721519 | 12693 |
| 1 | 0.749594 | 0.752920 | 0.703051 | 20438 | 0.076088 | 0.065926 | 0.803862 | 20438 |
| 2 | 2.655066 | 2.553642 | 0.697456 | 6889 | 0.054931 | 0.046214 | 0.741988 | 6889 |
| 3 | -1.231769 | -1.228394 | 0.794584 | 14763 | -0.940029 | -0.888888 | 0.723582 | 14763 |
# Scatter of the 2-D PCA projection, coloured by cluster.
ax = sns.scatterplot(data=pdf, x='1', y='2', hue='label', palette='RdYlBu')
plt.show()
# Magnitude of each original feature's loading on PC1 and PC2.
print(np.abs(pca.components_))
[[0.21012478 0.48010006 0.60682803 0.59116396 0.08738662] [0.43175628 0.17499634 0.02936798 0.1108952 0.87738394]]
# Column order matching the PCA loading columns printed above.
dfgo_std.columns
Index(['TIME_MONTH', 'LAST_TO_END', 'FLIGHT_COUNT', 'SEG_KM_SUM',
'avg_discount', 'label'],
dtype='object')
Dari masing-masing fitur berkaitan dengan model L(Loyalty), R(Recency), F(Frequency), M(Monetary), dan C(Discount) sebagai berikut:
# Undo the earlier log transform (inverse of log(x + 0.001)) to recover
# the original units of the four log-scaled features.
log_cols = ['TIME_MONTH', 'LAST_TO_END', 'FLIGHT_COUNT', 'SEG_KM_SUM']
dfgo_feat[log_cols] = np.exp(dfgo_feat[log_cols]) - 0.001
# Rename to the LRFMC model variables:
# L = loyalty, R = recency, F = frequency, M = monetary, C = discount.
dfgo_feat.columns = ['L', 'R', 'F', 'M', 'C']
dfgo_feat
| L | R | F | M | C | |
|---|---|---|---|---|---|
| 0 | 90.0 | 1.0 | 210.0 | 580717.0 | 0.961639 |
| 4 | 60.0 | 5.0 | 152.0 | 309928.0 | 0.970658 |
| 5 | 74.0 | 79.0 | 92.0 | 294585.0 | 0.967692 |
| 6 | 97.0 | 1.0 | 101.0 | 287042.0 | 0.965347 |
| 7 | 48.0 | 3.0 | 73.0 | 287230.0 | 0.962070 |
| ... | ... | ... | ... | ... | ... |
| 62962 | 32.0 | 492.0 | 2.0 | 368.0 | 0.750000 |
| 62963 | 113.0 | 252.0 | 2.0 | 368.0 | 0.750000 |
| 62964 | 14.0 | 416.0 | 2.0 | 368.0 | 0.750000 |
| 62965 | 64.0 | 418.0 | 2.0 | 368.0 | 0.750000 |
| 62973 | 12.0 | 357.0 | 2.0 | 760.0 | 0.350000 |
54783 rows × 5 columns
# Attach the cluster labels to the back-transformed LRFMC frame,
# then profile each cluster.
dfgo_feat.loc[:, 'label'] = kmeans.labels_
dfgo_feat.groupby('label').agg(['mean', 'median', 'std'])
| L | R | F | M | C | |||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| mean | median | std | mean | median | std | mean | median | std | mean | median | std | mean | median | std | |
| label | |||||||||||||||
| 0 | 42.829749 | 36.0 | 25.585817 | 250.375167 | 205.0 | 188.075646 | 4.134011 | 4.0 | 2.192462 | 6497.739857 | 5506.0 | 4126.954271 | 0.534159 | 0.546655 | 0.088757 |
| 1 | 47.313583 | 38.5 | 28.636732 | 90.988893 | 64.0 | 92.476246 | 13.854046 | 12.0 | 7.258350 | 20582.663861 | 17466.5 | 12592.203023 | 0.708401 | 0.707820 | 0.102547 |
| 2 | 59.257657 | 55.0 | 27.349370 | 11.961823 | 8.0 | 14.409742 | 37.548701 | 33.0 | 21.459622 | 51373.108579 | 44137.0 | 32956.729115 | 0.736333 | 0.735097 | 0.098578 |
| 3 | 47.862223 | 41.0 | 27.399385 | 293.543047 | 253.0 | 194.463156 | 3.979408 | 3.0 | 2.162760 | 5331.111563 | 4513.0 | 3569.871163 | 0.810808 | 0.800000 | 0.093643 |
# Share of customers in each cluster.
cluster_count = (
    dfgo_feat['label']
    .value_counts()
    .rename_axis('label')
    .reset_index(name='count')
)
cluster_count['percentage (%)'] = round((cluster_count['count'] / len(dfgo_std)) * 100, 2)
cluster_count = cluster_count.sort_values(by=['label']).reset_index(drop=True)
cluster_count
| label | count | percentage (%) | |
|---|---|---|---|
| 0 | 0 | 12693 | 23.17 |
| 1 | 1 | 20438 | 37.31 |
| 2 | 2 | 6889 | 12.58 |
| 3 | 3 | 14763 | 26.95 |
# Bar chart of the cluster percentages computed above.
fig, ax = plt.subplots(figsize=(10, 6))
# NOTE: `c` is reused as the shared cluster colour list by later plots.
c = ["#d7191c", "#fdae61", "#ffffbf", "#abd9e9", "#2c7bb6"]
bars = ax.bar(cluster_count['label'], cluster_count['percentage (%)'], color=c)
# Annotate each bar with its percentage value.
for bar in bars:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2, height, s=f'{height} %',
            ha='center', va='bottom')
ax.set_title('Percentage of Customer by Cluster', fontsize=16)
ax.set_xlabel('Cluster', fontsize=12)
ax.set_ylabel('Percentage', fontsize=12)
Text(0, 0.5, 'Percentage')
# melihat pola LRFMC pada setiap cluster
# Median of each standardized feature per cluster, reshaped to long form.
cluster_med = dfgo_std.groupby('label').median().reset_index()
# FIX: the original referenced an undefined name `lrfmc` (only `LRFMC`,
# with different values, is defined later). Plot every feature column of
# cluster_med (everything except the cluster label) instead; this also
# drops a redundant second reset_index() that injected a stray 'index' column.
metric_cols = [col for col in cluster_med.columns if col != 'label']
df_melt = pd.melt(cluster_med, id_vars='label', value_vars=metric_cols,
                  var_name='Metric', value_name='Value')
plt.figure(figsize=(10, 6))
sns.pointplot(data=df_melt, x='Metric', y='Value', hue='label', palette='RdYlBu')
plt.title('Pattern of Customer by LRFMC Model', fontsize=14)
plt.xlabel('Metric')
plt.ylabel('Value')
Text(0, 0.5, 'Value')
# Per-cluster medians of each LRFMC variable, compared to the overall median.
LRFMC = ['L', 'R', 'F', 'M', 'C']

def dist_list(lst):
    """Draw one horizontal bar chart per column in *lst*.

    Bars show the median of the column within each cluster (reads the global
    `dfgo_feat` frame and the global colour list `c`); a dashed grey vertical
    line marks the column's overall median.
    """
    plt.figure(figsize=[len(lst) * 5, 3])
    by_cluster = dfgo_feat.groupby('label')
    for pos, col in enumerate(lst, start=1):
        ax = plt.subplot(1, len(lst), pos)
        # Overall median as a reference line.
        ax.vlines(dfgo_feat[col].median(), ymin=0, ymax=4, color='grey', linestyle='--')
        med = by_cluster[col].median()
        ax.barh(med.index, med.values, color=c)
        plt.title(col)

dist_list(LRFMC)
Secara keseluruhan cluster-cluster ini terbentuk karena adanya perbedaan indikator model LRFMC. Berdasarkan hasil analisis dan visualisasi dapat diketahui karakteristik customer pada setiap kelompok sebagai berikut:
Tabel 2 - Akumulasi Hasil Nilai LRFMC pada Setiap Cluster
| Cluster | High Value | Average Value | Low Value |
|---|---|---|---|
| Cluster 0 | R | L F M C | |
| Cluster 1 | L F M C | R | |
| Cluster 2 | L F M | C | R |
| Cluster 3 | R C | L | F M |
Cluster 0 : Kelompok new passenger yang penggunaan maskapainya rendah, baru menjadi membership, penggunaan diskon yang rendah, dan jarak tempuh perjalanan yang dilakukan juga terbilang masih kecil.
Cluster 1 : Kelompok passenger menengah (potential loyal passenger) dengan lama membership, frekuensi penggunaan maskapai, jarak tempuh perjalanan, dan penggunaan diskon yang terbilang tidak tinggi dan juga tidak rendah (menengah), serta rentang waktu dari waktu penerbangan terakhir dengan pesanan terakhir yang dilakukan cukup rendah.
Cluster 2 : Kelompok loyal passenger dengan membership di maskapai yang sudah cukup lama, sering menggunakan maskapai, dan jarak tempuh perjalanan yang dilakukan sudah sangat tinggi. Walaupun penggunaan diskon yang tergolong menengah, recency/rentang waktu dari waktu penerbangan terakhir dengan pesanan terakhir yang dilakukan rendah.
Cluster 3 : Kelompok passenger menengah dengan lama membership di maskapai menengah (tidak baru dan tidak lama juga sejak terhitung bergabung untuk menggunakan maskapai), penggunaan diskon yang cukup tinggi pada kelompok passenger ini dengan rentang waktu dari waktu penerbangan terakhir dengan pesanan terakhir yang dilakukan cukup tinggi, serta aktivitas penggunaan maskapai rendah dan jarak tempuh perjalanannya pun masih rendah (Kemungkinan kelompok passenger ini menggunakan maskapai jika ada promo. Jika tidak ada promo, maka kemungkinan tidak melakukan transaksi di maskapai).